suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(RColorBrewer)
library(tidyverse)
library(VennDiagram)
})
suppressMessages({
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/plotMA.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
message(paste("Plotting",gene_id))
sel <- grepl(gene_id,rownames(vst))
stopifnot(sum(sel)==1)
p <- ggplot(bind_cols(as.data.frame(colData(dds)),
data.frame(value=vst[sel,])),
aes(x=Time,y=value,col=Time,group=Time)) +
geom_point() + geom_smooth() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id))
suppressMessages(suppressWarnings(plot(p)))
return(NULL)
}
"extract_results" <- function(dds,vst,contrast,
padj=0.01,lfc=0.5,
plot=TRUE,verbose=TRUE,
export=TRUE,default_dir=here("data/analysis/DE"),
default_prefix="DE-",
labels=colnames(dds),
sample_sel=1:ncol(dds),
expression_cutoff=0,
debug=FALSE){
if(length(contrast)==1){
res <- results(dds,name=contrast)
} else {
res <- results(dds,contrast=contrast)
}
if(plot){
par(mar=c(5,5,5,5))
volcanoPlot(res)
par(mar=mar)
}
# a look at independent filtering
if(plot){
plot(metadata(res)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
}
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
round(max.theta*100,digits=5),
round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
}
if(plot){
qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
qtl.exp=qtl.exp,
number.degs=sapply(lapply(qtl.exp,function(qe){
res$padj <= padj & abs(res$log2FoldChange) >= lfc &
! is.na(res$padj) & res$baseMean >= qe
}),sum))
plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red"))
if(debug){
p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_log10("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
geom_line() + geom_point() +
geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes"))
plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Cumulative number of DE genes"))
plot(ggplot(data.frame(x=dat$thetas[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes per interval"))
plot(ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("base mean of expression") +
scale_y_continuous("Number of DE genes per interval"))
}
p <- ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_log10("base mean of expression") +
scale_y_continuous("Number of DE genes per interval") +
geom_vline(xintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
}
sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) &
res$baseMean >= expression_cutoff
if(verbose){
message(sprintf("There are %s genes that are DE with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s",
sum(sel),
lfc,padj,expression_cutoff))
}
val <- rowSums(vst[sel,sample_sel])==0
if (sum(val) >0){
warning(sprintf("There are %s DE genes that have no vst expression in the selected samples",sum(val)))
sel[sel][val] <- FALSE
}
if(export){
if(!dir.exists(default_dir)){
dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
}
write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
}
if(plot){
heatmap.2(t(scale(t(vst[sel,sample_sel]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method="ward.D2")},
trace="none",col=hpal,labRow = FALSE,
labCol=labels[sample_sel]
)
}
return(list(all=rownames(res[sel,]),
up=rownames(res[sel & res$log2FoldChange > 0,]),
dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
load(here("data/analysis/salmon/dds-sample-swap-corrected.rda"))
vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/vst-aware.rda"))
We do not have any goi at that time
# goi <- read_lines(here("doc/goi.txt"))
# stopifnot(all(goi %in% rownames(vst)))
# dev.null <- lapply(goi,line_plot,dds=dds,vst=vst)
dds <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)
Check the different contrasts
resultsNames(dds)
## [1] "Intercept" "Time_120hrs_vs_std" "Time_12hrs_vs_std"
## [4] "Time_24hrs_vs_std" "Time_4hrs_vs_std" "Time_60min_vs_std"
## [7] "Time_72hrs_vs_std"
# fail
T1vsCtl <- extract_results(dds=dds,vst=vst,
contrast="Time_60min_vs_std",
default_prefix="DE-Time-1h-vs-std",
labels=paste(dds$Time,dds$Replicate,sep="-"),
sample_sel=dds$Time %in% c("60min","std"))
## Loading required package: LSD
## The independent filtering cutoff is 0.96719, removing 15.61273% of the data
## The independent filtering maximises for 19.30516 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 10728 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
## Warning in extract_results(dds = dds, vst = vst, contrast =
## "Time_60min_vs_std", : There are 53 DE genes that have no vst expression in the
## selected samples
T4vs1 <- extract_results(dds=dds,vst=vst,
contrast=list("Time_4hrs_vs_std","Time_60min_vs_std"),
default_prefix="DE-Time-4h-vs-1h",
labels=paste(dds$Time,dds$Replicate,sep="-"),
sample_sel=dds$Time %in% c("4hrs","60min"))
## The independent filtering cutoff is 1.24531, removing 17.45895% of the data
## The independent filtering maximises for 21.15138 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 10962 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
T12vs4 <- extract_results(dds=dds,vst=vst,
contrast=list("Time_12hrs_vs_std","Time_4hrs_vs_std"),
default_prefix="DE-Time-12h-vs-4h",
labels=paste(dds$Time,dds$Replicate,sep="-"),
sample_sel=dds$Time %in% c("12hrs","4hrs"))
## The independent filtering cutoff is 1.57275, removing 19.30516% of the data
## The independent filtering maximises for 22.9976 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 9583 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
T24vs12 <- extract_results(dds=dds,vst=vst,contrast=list("Time_24hrs_vs_std","Time_12hrs_vs_std"),
default_prefix="DE-Time-24h-vs-12h",
labels=paste(dds$Time,dds$Replicate,sep="-"),
sample_sel=dds$Time %in% c("24hrs","12hrs"))
## The independent filtering cutoff is 1.24531, removing 17.45895% of the data
## The independent filtering maximises for 21.15138 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 9701 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
T72vs24 <- extract_results(dds=dds,vst=vst,contrast=list("Time_72hrs_vs_std","Time_24hrs_vs_std"),
default_prefix="DE-Time-72h-vs-24h",
labels=paste(dds$Time,dds$Replicate,sep="-"),
sample_sel=dds$Time %in% c("72hrs","24hrs"))
## The independent filtering cutoff is 1.24531, removing 17.45895% of the data
## The independent filtering maximises for 21.15138 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 4944 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
T120vs72 <- extract_results(dds=dds,vst=vst,contrast=list("Time_120hrs_vs_std","Time_72hrs_vs_std"),
default_prefix="DE-Time-120h-vs-72h",
labels=paste(dds$Time,dds$Replicate,sep="-"),
sample_sel=dds$Time %in% c("120hrs","72hrs"))
## The independent filtering cutoff is 2.36286, removing 22.9976% of the data
## The independent filtering maximises for 34.07489 % of the data, corresponding to a base mean expression of 1.82808 (library-size normalised read)
## There are 534 genes that are DE with the following parameters: FDR <= 0.5, |log2FC| >= 0.01, base mean expression > 0
res.list <- list(T1vsCtl=T1vsCtl,
T4vs1=T4vs1,
T12vs4=T12vs4,
T24vs12=T24vs12,
T72vs24=T72vs24,
T120vs72=T120vs72)
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[1:2],"[[","all"),
NULL,
fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[1:2],"[[","up"),
NULL,
fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[1:2],"[[","dn"),
NULL,
fill=pal[1:2]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[2:4],"[[","all"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[2:4],"[[","up"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[2:4],"[[","dn"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[4:6],"[[","all"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[4:6],"[[","up"),
NULL,
fill=pal[1:3]))
grid.newpage()
grid.draw(venn.diagram(lapply(res.list[4:6],"[[","dn"),
NULL,
fill=pal[1:3]))
background <- rownames(vst)[featureSelect(vst,dds$Time,exp=0.1)]
enr.list <- lapply(res.list,function(r){
lapply(r,gopher,background=background,task="go",url="algae")
})
## Loading required package: jsonlite
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
## No enrichments found in task: go
dev.null <- lapply(names(enr.list),function(n){
r <- enr.list[[n]]
if(!is.null(r$all$go)){
write_delim(r$all$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"-all-DE-genes_GO-enrichment.txt")))))
write_delim(r$all$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"-all-DE-genes_GO-enrichment_for-REVIGO.txt")))))
}
if(!is.null(r$up$go)){
write_csv(r$up$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"-up-DE-genes_GO-enrichment.txt")))))
write_delim(r$up$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"-up-DE-genes_GO-enrichment_for-REVIGO.txt")))))
}
if(!is.null(r$dn$go)){
write_csv(r$dn$go,path=file.path(file.path(here("data/analysis/DE",
paste0(n,"-down-DE-genes_GO-enrichment.txt")))))
write_delim(r$dn$go[,c("id","padj")],path=file.path(file.path(here("data/analysis/DE",
paste0(n,"-down-DE-genes_GO-enrichment_for-REVIGO.txt")))))
}
})
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] jsonlite_1.6 LSD_4.0-0
## [3] limma_3.42.0 VennDiagram_1.6.20
## [5] futile.logger_1.4.3 forcats_0.4.0
## [7] stringr_1.4.0 dplyr_0.8.3
## [9] purrr_0.3.3 readr_1.3.1
## [11] tidyr_1.0.0 tibble_2.1.3
## [13] tidyverse_1.3.0 RColorBrewer_1.1-2
## [15] hyperSpec_0.99-20180627 ggplot2_3.2.1
## [17] lattice_0.20-38 here_0.1
## [19] gplots_3.0.1.1 DESeq2_1.26.0
## [21] SummarizedExperiment_1.16.0 DelayedArray_0.12.0
## [23] BiocParallel_1.20.0 matrixStats_0.55.0
## [25] Biobase_2.46.0 GenomicRanges_1.38.0
## [27] GenomeInfoDb_1.22.0 IRanges_2.20.1
## [29] S4Vectors_0.24.0 BiocGenerics_0.32.0
## [31] data.table_1.12.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.2
## [4] XVector_0.26.0 base64enc_0.1-3 fs_1.3.1
## [7] rstudioapi_0.10 farver_2.0.1 bit64_0.9-7
## [10] AnnotationDbi_1.48.0 lubridate_1.7.4 xml2_1.2.2
## [13] splines_3.6.1 geneplotter_1.64.0 knitr_1.26
## [16] zeallot_0.1.0 Formula_1.2-3 broom_0.5.2
## [19] annotate_1.64.0 cluster_2.1.0 dbplyr_1.4.2
## [22] compiler_3.6.1 httr_1.4.1 backports_1.1.5
## [25] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [28] cli_1.1.0 formatR_1.7 acepack_1.4.1
## [31] htmltools_0.4.0 tools_3.6.1 gtable_0.3.0
## [34] glue_1.3.1 GenomeInfoDbData_1.2.2 Rcpp_1.0.3
## [37] cellranger_1.1.0 vctrs_0.2.0 gdata_2.18.0
## [40] nlme_3.1-142 xfun_0.11 testthat_2.3.0
## [43] rvest_0.3.5 lifecycle_0.1.0 gtools_3.8.1
## [46] XML_3.98-1.20 zlibbioc_1.32.0 scales_1.1.0
## [49] hms_0.5.2 lambda.r_1.2.4 curl_4.2
## [52] yaml_2.2.0 memoise_1.1.0 gridExtra_2.3
## [55] rpart_4.1-15 latticeExtra_0.6-28 stringi_1.4.3
## [58] RSQLite_2.1.2 highr_0.8 genefilter_1.68.0
## [61] checkmate_1.9.4 caTools_1.17.1.2 rlang_0.4.2
## [64] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [67] labeling_0.3 htmlwidgets_1.5.1 bit_1.1-14
## [70] tidyselect_0.2.5 magrittr_1.5 R6_2.4.1
## [73] generics_0.0.2 Hmisc_4.3-0 DBI_1.0.0
## [76] pillar_1.4.2 haven_2.2.0 foreign_0.8-72
## [79] withr_2.1.2 survival_3.1-7 RCurl_1.95-4.12
## [82] nnet_7.3-12 modelr_0.1.5 crayon_1.3.4
## [85] futile.options_1.0.1 KernSmooth_2.23-16 rmarkdown_1.18
## [88] locfit_1.5-9.1 readxl_1.3.1 blob_1.2.0
## [91] reprex_0.3.0 digest_0.6.23 xtable_1.8-4
## [94] munsell_0.5.0